Comparison of the BindingSiteSet.txt file between the last releases.
| Release version | Date |
|---|---|
| 10.6 | 2019 July |
| 10.6.3 | - |
| 10.7 | 2020 April |
| 10.8 | 2020 October |
| 10.9 | 2021 April |
| 10.10 | 2022 February |
| 11.0 | 2022 August |
| 11.0.1 (not public) | 2022 September |
| 11.0.2 (not public) | 2022 September |
Notes:
Comments on evidence and confidence issues:
## v10.6 is excluded because it does not include TU_ID, only TU names, which are not unique; otherwise it is almost the same as v10.7, which is included
## Some columns (e.g. strand and confidence) are reformatted so they are uniform across versions (uppercase/lowercase, word vs. +/- symbol for the strand, etc.)
## Starting with v11.0.1 there are 2 evidence columns; here they are merged so they can be compared with older versions
# Load BindingSiteSet.tsv from every release directory and harmonise the
# columns whose format changed between releases.
#
# Outputs (all read by later chunks):
#   tfbs_sets          named list, one table per release ("v10.7", ...)
#   tfbs_versions      release tags in load order
#   tfbs_set_<tag>     one global per release (kept because later chunks
#                      reference e.g. tfbs_set_v10.7 directly)
dir_versions <- c("10.7", "10.8", "10.9", "10.10", "11.0", "11.0.1", "11.0.2")
tfbs_sets <- list()
for (v in dir_versions) {
  version_tag <- paste0("v", v)
  set <- read.delim(
    file.path(dir_releases, v, "BindingSiteSet.tsv"),
    comment.char = "#", header = TRUE, stringsAsFactors = FALSE,
    na.strings = c("", "NA")
  ) %>%
    dplyr::as_tibble() %>%
    dplyr::mutate(
      version = version_tag,
      # Releases spell the strand either as a word or as a symbol. Accept
      # both (case-insensitively) so that files already using "+"/"-" are
      # not turned into NA; anything else becomes NA.
      strand = tolower(trimws(as.character(strand))),
      strand = dplyr::case_when(
        strand %in% c("reverse", "-") ~ "-",
        strand %in% c("forward", "+") ~ "+",
        TRUE ~ NA_character_
      ),
      confidence = tolower(confidence)
    )
  # Newer releases carry a second evidence column; merge it into `evidence`
  # so every release exposes a single comparable column.
  # concat_uniq2() is applied row by row, as before, because it is defined
  # elsewhere and may not be vectorised.
  if ("evidence_function" %in% colnames(set)) {
    set <- set %>%
      dplyr::rowwise() %>%
      dplyr::mutate(evidence = concat_uniq2(evidence, evidence_function)) %>%
      dplyr::ungroup() # do not leak row-wise grouping into later chunks
  }
  set <- dplyr::mutate(set, coords = paste0(start, "_", stop))
  assign(paste0("tfbs_set_", version_tag), set)
  tfbs_sets[[version_tag]] <- set
}
tfbs_versions <- names(tfbs_sets)
# Stack all releases into one table and turn the comparison variables into
# factors with a fixed level order (releases in load order; effect and
# confidence in the order given below).
all_tfbs <- dplyr::mutate(
  bind_rows(tfbs_sets),
  version = factor(version, levels = tfbs_versions),
  effect = factor(effect, levels = c("+", "-", "?")),
  confidence = factor(confidence, levels = c("weak", "strong", "confirmed"))
)
# One row per (TFBS entry, evidence item): split the comma-separated
# `evidence` field, strip the square brackets, and break each item on "|"
# into code / level / name.
# Items are trimmed so that a space after the comma does not create a
# distinct evidence code (same handling as the TF-gene network section).
all_tfbs_by_evidence <- all_tfbs %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  dplyr::mutate(evidence = trimws(gsub("\\[|\\]", "", evidence))) %>%
  tidyr::separate(
    evidence,
    c("evidence_code", "evidence_level", "evidence_name"),
    sep = "\\|"
  )

# Number of BindingSiteSet entries per release.
tfbs_summary <- all_tfbs %>%
  dplyr::group_by(version) %>%
  dplyr::summarise(total = n()) %>%
  dplyr::arrange(version)

# Bar plot of the totals; simple_bar() is a helper defined outside this chunk.
TFBS_num <- simple_bar(tfbs_summary, "version", "total") +
  scale_fill_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(x = "Release version", y = "Number of TFBSs", title = "")
TFBS_num

DT::datatable(tfbs_summary, rownames = FALSE, options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10))
Comments:
# TFBS_ID duplication: for each release, count how often every TFBS_ID
# occurs, then how many IDs share each occurrence count.
# Both summaries drop their grouping. Otherwise factor() runs once per
# release and the combined level order follows the order in which counts
# first appear, instead of being numeric (which scrambles the legend).
tfbs_ids_dupli <- all_tfbs %>%
  dplyr::group_by(version, TFBS_ID) %>%
  dplyr::summarise(occurrences = n(), .groups = "drop") %>%
  dplyr::group_by(version, occurrences) %>%
  dplyr::summarise(tfbs_number = n(), .groups = "drop") %>%
  dplyr::mutate(occurrences = factor(occurrences))

# Text sizes shared by both panels.
axis_text_14 <- theme(
  axis.text = element_text(size = 14),
  axis.title.y = element_text(size = 14),
  title = element_text(size = 14)
)

# Top panel: every occurrence count.
dodge <- ggplot(tfbs_ids_dupli, aes(fill = occurrences, y = tfbs_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  labs(x = "", y = "Number of unique TFBS IDs", title = "TFBS ID duplication in BindingSiteSet.txt across versions")

# Bottom panel: only IDs present 5 or more times. The legend is hidden
# because drop = FALSE keeps the colours aligned with the top panel.
dodge2 <- ggplot(
  tfbs_ids_dupli %>% dplyr::filter(!occurrences %in% c("1", "2", "3", "4")),
  aes(fill = occurrences, y = tfbs_number, x = version)
) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of unique TFBS IDs", title = "...minimum 5 copies")

# Stack the two panels vertically (the `/` operator is applied to two plots).
dodge / dodge2
## TFBS IDs that have at least 10 entries in a given version
# TFBS IDs with at least 10 entries in some release, one column per release.
# The grouping is dropped before reshaping so pivot_wider() does not run on
# a table still grouped by the column it spreads.
tfbs_ids_dupli_max <- all_tfbs %>%
  dplyr::group_by(version, TFBS_ID) %>%
  dplyr::summarise(occurrences = n(), .groups = "drop") %>%
  dplyr::filter(occurrences >= 10) %>%
  dplyr::arrange(dplyr::desc(occurrences)) %>%
  tidyr::pivot_wider(names_from = version, values_from = occurrences)

DT::datatable(tfbs_ids_dupli_max, rownames = FALSE, options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10))

coordinates <- list()
# Unique binding-site coordinates ("start_stop") per release, taken from the
# per-release tables stored in tfbs_sets.
coordinates <- lapply(tfbs_sets[tfbs_versions], function(set) unique(set$coords))

# Builds one highlighted-intersection query for UpSetR::upset().
upset_query <- function(versions, colour) {
  list(query = UpSetR::intersects, params = as.list(versions), color = colour, active = TRUE)
}

UpSetR::upset(
  UpSetR::fromList(coordinates),
  sets = tfbs_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6),
  queries = list(
    upset_query(c("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.7", "v10.8"), "red")
  )
)

# Coordinate duplication: for each release, count how often every
# coordinate pair occurs, then how many pairs share each occurrence count.
# Grouping is dropped before factor() so the levels are in numeric order.
coords_dupli <- all_tfbs %>%
  dplyr::group_by(version, coords) %>%
  dplyr::summarise(occurrences = n(), .groups = "drop") %>%
  dplyr::group_by(version, occurrences) %>%
  dplyr::summarise(coords_number = n(), .groups = "drop") %>%
  dplyr::mutate(occurrences = factor(occurrences))

# Text sizes shared by both panels.
axis_text_14 <- theme(
  axis.text = element_text(size = 14),
  axis.title.y = element_text(size = 14),
  title = element_text(size = 14)
)

# Labels describe coordinates; the previous text was copied from the
# TFBS ID plot and mislabelled this figure.
dodge <- ggplot(coords_dupli, aes(fill = occurrences, y = coords_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  labs(x = "", y = "Number of unique coordinates", title = "Coordinates duplication in BindingSiteSet.txt across versions")

# Same plot restricted to coordinates present 5 or more times.
dodge2 <- ggplot(
  coords_dupli %>% dplyr::filter(!occurrences %in% c("1", "2", "3", "4")),
  aes(fill = occurrences, y = coords_number, x = version)
) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of unique coordinates", title = "...minimum 5 copies")

dodge / dodge2
## coords that have at least 10 entries in a given version
# Coordinates with at least 10 entries in some release, one column per release.
coords_dupli_max <- all_tfbs %>%
  dplyr::group_by(version, coords) %>%
  dplyr::summarise(occurrences = n(), .groups = "drop") %>%
  dplyr::filter(occurrences >= 10) %>%
  dplyr::arrange(dplyr::desc(occurrences)) %>%
  tidyr::pivot_wider(names_from = version, values_from = occurrences)

DT::datatable(coords_dupli_max, rownames = FALSE, options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10))

# Per-release density of the distance_TSS column, limited to [-1000, 1000].
ggplot(all_tfbs, aes(x = distance_TSS, y = version, fill = version)) +
  ggridges::geom_density_ridges(color = "white") +
  ggridges::theme_ridges() +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme(legend.position = "none") +
  xlim(-1000, 1000)

# Per-release density of the distance_gene column, same window.
ggplot(all_tfbs, aes(x = distance_gene, y = version, fill = version)) +
  ggridges::geom_density_ridges(color = "white") +
  ggridges::theme_ridges() +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme(legend.position = "none") +
  xlim(-1000, 1000)

# Text sizes shared by every panel below.
axis_text_14 <- theme(
  axis.text = element_text(size = 14),
  axis.title.y = element_text(size = 14),
  title = element_text(size = 14)
)

# ---- Effect ---------------------------------------------------------------
# Number of entries per release and effect.
tfbs_effect_long <- all_tfbs %>%
  dplyr::group_by(version, effect) %>%
  dplyr::summarise(value = n(), .groups = "drop")

dodge <- ggplot(tfbs_effect_long, aes(fill = effect, y = value, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "right") +
  labs(x = "", y = "Number of TFBS", title = "")

stack <- ggplot(tfbs_effect_long, aes(fill = effect, y = value, x = version)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of TFBS", title = "")

line <- ggplot(tfbs_effect_long, aes(group = effect, y = value, x = version)) +
  geom_line(aes(color = effect)) +
  geom_point(size = 2, aes(color = effect)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "right") +
  labs(x = "Release version", y = "Number of TFBS", title = "")

## ggplotly to make it interactive
# fig0 <- ggplotly(TFBS_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)
subplot(fig1, fig2, fig3, nrows = 3)

# Same counts as a table: one row per effect, one column per release.
effect_summary <- all_tfbs %>%
  dplyr::group_by(version, effect) %>%
  dplyr::summarise(value = n(), .groups = "drop") %>%
  data.frame() %>%
  tidyr::pivot_wider(names_from = version, values_from = value) %>%
  dplyr::arrange(effect)

DT::datatable(effect_summary, rownames = FALSE, options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10))

# ---- Evidence -------------------------------------------------------------
# Number of evidence items per release, evidence code and evidence name.
tfbs_evidence_long <- all_tfbs_by_evidence %>%
  dplyr::group_by(version, evidence_code, evidence_name) %>%
  dplyr::summarise(value = n(), .groups = "drop")

# One colour per distinct evidence code; random_palette() is defined
# outside this chunk.
evidence_palette <- random_palette(length(unique(all_tfbs_by_evidence$evidence_code)))

dodge <- ggplot(tfbs_evidence_long, aes(fill = evidence_code, y = value, x = version, group = evidence_name)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "right") +
  labs(x = "", y = "Number of TFBSs", title = "")

stack <- ggplot(tfbs_evidence_long, aes(fill = evidence_code, y = value, x = version, group = evidence_name)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of evidence", title = "")

line <- ggplot(tfbs_evidence_long, aes(group = evidence_name, y = value, x = version)) +
  geom_line(aes(color = evidence_code)) +
  scale_color_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  geom_point(size = 2, aes(color = evidence_code)) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "Release version", y = "Number of TFBS", title = "")

# TFBS_num is the per-release total bar plot built in an earlier chunk.
fig0 <- ggplotly(TFBS_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)
subplot(fig0, fig1, fig2, fig3, nrows = 4)

# Releases in which each evidence code / name pair appears; concat_uniq()
# is defined outside this chunk.
tfbs_evidence_table <- all_tfbs_by_evidence %>%
  dplyr::group_by(evidence_code, evidence_name) %>%
  dplyr::summarise(version = concat_uniq(version), .groups = "drop")

DT::datatable(tfbs_evidence_table, rownames = FALSE, options = list(searching = TRUE, lengthChange = FALSE, pageLength = 10))

tfbs_evidence_shared <- list()
# Evidence codes present in each release, compared with an UpSet plot.
tfbs_evidence_shared <- lapply(
  setNames(tfbs_versions, tfbs_versions),
  function(v) unique((all_tfbs_by_evidence %>% dplyr::filter(version == v))$evidence_code)
)
UpSetR::upset(
  UpSetR::fromList(tfbs_evidence_shared),
  sets = tfbs_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6)
)
# Highlight queries currently disabled:
# queries = list(list(query = intersects, params = list("v10.7"), color = "red", active = T),
#                list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = T))

## Revise congruency between table and upset plot (table has CE listed as only present in 10.8, should have 10.7 too)

# Text sizes shared by every panel below.
axis_text_14 <- theme(
  axis.text = element_text(size = 14),
  axis.title.y = element_text(size = 14),
  title = element_text(size = 14)
)

# ---- Confidence -----------------------------------------------------------
# Number of entries per release and confidence level. This assignment used
# to sit on the same line as the comment above, so it was never executed.
tfbs_confidence_long <- all_tfbs %>%
  dplyr::group_by(version, confidence) %>%
  dplyr::summarise(value = n(), .groups = "drop")

dodge <- ggplot(tfbs_confidence_long, aes(fill = confidence, y = value, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "right") +
  labs(x = "", y = "Number of TFBS", title = "")

stack <- ggplot(tfbs_confidence_long, aes(fill = confidence, y = value, x = version)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of TFBS", title = "")

line <- ggplot(tfbs_confidence_long, aes(group = confidence, y = value, x = version)) +
  geom_line(aes(color = confidence)) +
  geom_point(size = 2, aes(color = confidence)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "right") +
  labs(x = "Release version", y = "Number of TFBS", title = "")

## ggplotly to make it interactive
# fig0 <- ggplotly(TFBS_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)
subplot(fig1, fig2, fig3, nrows = 3)

# Confidence counts as a table: one row per level (missing confidence is
# shown as "null"), one column per release, plus a final "total" row.
confidence_summary <- all_tfbs %>%
  dplyr::group_by(version, confidence) %>%
  dplyr::summarise(value = n(), .groups = "drop") %>%
  dplyr::mutate(confidence = ifelse(is.na(confidence), "null", as.character(confidence))) %>%
  data.frame() %>%
  tidyr::pivot_wider(names_from = version, values_from = value) %>%
  # Release columns all start with "v"; a level absent from a release counts as 0.
  dplyr::mutate(dplyr::across(dplyr::starts_with("v"), ~ tidyr::replace_na(., 0))) %>%
  # Append the column sums as a row labelled "total".
  dplyr::bind_rows(dplyr::summarise(
    .,
    dplyr::across(where(is.numeric), sum),
    dplyr::across(where(is.character), ~"total")
  )) %>%
  dplyr::mutate(confidence = factor(confidence, levels = c("weak", "strong", "confirmed", "null", "total"))) %>%
  dplyr::arrange(confidence)

DT::datatable(confidence_summary, rownames = FALSE, options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10))

ri_ids <- list()
# Unique RI_ID values per release, compared with an UpSet plot.
ri_ids <- lapply(tfbs_sets[tfbs_versions], function(set) unique(set$RI_ID))

# Builds one highlighted-intersection query for UpSetR::upset().
upset_query <- function(versions, colour) {
  list(query = UpSetR::intersects, params = as.list(versions), color = colour, active = TRUE)
}

UpSetR::upset(
  UpSetR::fromList(ri_ids),
  sets = tfbs_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6),
  queries = list(
    upset_query(c("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.7", "v10.8", "v10.9", "v10.10"), "red"),
    upset_query(c("v10.7", "v10.8"), "red")
  )
)

# RI_ID duplication: for each release, count how often every RI_ID occurs,
# then how many IDs share each occurrence count. Grouping is dropped before
# factor() so the levels are in numeric order.
ris_ids_dupli <- all_tfbs %>%
  dplyr::group_by(version, RI_ID) %>%
  dplyr::summarise(occurrences = n(), .groups = "drop") %>%
  dplyr::group_by(version, occurrences) %>%
  dplyr::summarise(ris_number = n(), .groups = "drop") %>%
  dplyr::mutate(occurrences = factor(occurrences))

# Text sizes shared by both panels.
axis_text_14 <- theme(
  axis.text = element_text(size = 14),
  axis.title.y = element_text(size = 14),
  title = element_text(size = 14)
)

dodge <- ggplot(ris_ids_dupli, aes(fill = occurrences, y = ris_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  labs(x = "", y = "Number of unique RI IDs", title = "RI ID duplication in BindingSiteSet.txt across versions")

# Same plot restricted to IDs present more than once.
dodge2 <- ggplot(
  ris_ids_dupli %>% dplyr::filter(!occurrences %in% c("1")),
  aes(fill = occurrences, y = ris_number, x = version)
) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of unique RI IDs", title = "...minimum 2 copies")

dodge / dodge2

tf_ids <- list()
# Unique TF_ID values per release, compared with an UpSet plot.
tf_ids <- lapply(tfbs_sets[tfbs_versions], function(set) unique(set$TF_ID))

# Builds one highlighted-intersection query for UpSetR::upset().
upset_query <- function(versions, colour) {
  list(query = UpSetR::intersects, params = as.list(versions), color = colour, active = TRUE)
}

UpSetR::upset(
  UpSetR::fromList(tf_ids),
  sets = tfbs_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique TF IDs",
  queries = list(
    upset_query(c("v10.8", "v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.7", "v10.8", "v10.9"), "red"),
    upset_query(c("v10.7"), "red")
  )
)

# TF IDs present in v10.7 but absent from v11.0.2.
# fromList() returns a 0/1 membership matrix whose row names are plain row
# indices, so taking rownames() of the filtered matrix returned numbers
# rather than IDs; compare the ID vectors directly instead.
tf_ids_gone <- setdiff(tf_ids[["v10.7"]], tf_ids[["v11.0.2"]])
# Unique TF names per release, compared with an UpSet plot.
tf_names <- lapply(tfbs_sets[tfbs_versions], function(set) unique(set$TF_name))

# Builds one highlighted-intersection query for UpSetR::upset().
upset_query <- function(versions, colour) {
  list(query = UpSetR::intersects, params = as.list(versions), color = colour, active = TRUE)
}

UpSetR::upset(
  UpSetR::fromList(tf_names),
  sets = tfbs_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique TF names",
  queries = list(
    upset_query(c("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.7", "v10.8", "v10.9"), "red")
  )
)

# TF names present in v10.7 but absent from v11.0.2.
# fromList() does not keep the element names as row names, so the names are
# obtained by comparing the two vectors directly.
tf_names_gone <- setdiff(tf_names[["v10.7"]], tf_names[["v11.0.2"]])

promoter_name <- list()
# Unique promoter names per release, compared with an UpSet plot.
promoter_name <- lapply(tfbs_sets[tfbs_versions], function(set) unique(set$promoter))

# Builds one highlighted-intersection query for UpSetR::upset().
upset_query <- function(versions, colour) {
  list(query = UpSetR::intersects, params = as.list(versions), color = colour, active = TRUE)
}

UpSetR::upset(
  UpSetR::fromList(promoter_name),
  sets = tfbs_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique promoter names",
  queries = list(
    upset_query(c("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.7", "v10.8", "v10.9"), "red")
  )
)

tu_ids <- list()
# Unique TU_ID values per release, compared with an UpSet plot.
tu_ids <- lapply(tfbs_sets[tfbs_versions], function(set) unique(set$TU_ID))

# Builds one highlighted-intersection query for UpSetR::upset().
upset_query <- function(versions, colour) {
  list(query = UpSetR::intersects, params = as.list(versions), color = colour, active = TRUE)
}

UpSetR::upset(
  UpSetR::fromList(tu_ids),
  sets = tfbs_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique TU IDs",
  queries = list(
    upset_query(c("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.7", "v10.8", "v10.9"), "red")
  )
)

tu_names <- list()
# Unique TU names per release, compared with an UpSet plot.
tu_names <- lapply(tfbs_sets[tfbs_versions], function(set) unique(set$TU_name))

# Builds one highlighted-intersection query for UpSetR::upset().
upset_query <- function(versions, colour) {
  list(query = UpSetR::intersects, params = as.list(versions), color = colour, active = TRUE)
}

UpSetR::upset(
  UpSetR::fromList(tu_names),
  sets = tfbs_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique TU names",
  queries = list(
    upset_query(c("v10.8", "v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.10", "v11.0", "v11.0.1", "v11.0.2"), "blue"),
    upset_query(c("v10.7", "v10.8", "v10.9"), "red")
  )
)
Notes:
The following table displays the differences. Many (most?) differences seem to be caused by different promoter names
Ex.
## Join versions 10.7 and 11.0.2 by TFBS_ID and promoter name
# Full join of the v10.7 and v11.0.2 tables on TFBS_ID + promoter. Columns
# present in both releases get the suffixes "_10.7" / "_11.0.2".
# starts_with("evidence_1") picks up exactly those two suffixed evidence
# columns (both suffixes begin with "1").
tfbs_join_107_1102 <- tfbs_set_v10.7 %>%
  dplyr::full_join(tfbs_set_v11.0.2, by = c("TFBS_ID", "promoter"), suffix = c("_10.7", "_11.0.2")) %>%
  dplyr::arrange(TFBS_ID) %>%
  dplyr::select(
    TFBS_ID, promoter,
    dplyr::starts_with("TF_name"), dplyr::starts_with("TU"),
    dplyr::starts_with("coords"), dplyr::starts_with("evidence_1"),
    dplyr::starts_with("confidence")
  )
# select(TFBS_ID, promoter, everything())

# Rows with no missing value are treated as matches; every other (distinct)
# row is reported as a difference between the two releases.
tfbs_matches_107_1102 <- na.omit(tfbs_join_107_1102)
tfbs_differences_107_1102 <- dplyr::setdiff(tfbs_join_107_1102, tfbs_matches_107_1102)

write.table(
  tfbs_join_107_1102,
  file = paste0(dir_results, "/TFBS_full_join_107_1102.tsv"),
  quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t"
)
write.table(
  tfbs_differences_107_1102,
  file = paste0(dir_results, "/TFBS_differences_107_1102.tsv"),
  quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t"
)

# NOTE(review): with rownames = FALSE, DT column targets are 0-based; confirm
# that 11 and 12 are the columns meant to be widened.
DT::datatable(
  tfbs_differences_107_1102,
  rownames = FALSE,
  options = list(
    searching = TRUE, lengthChange = FALSE, pageLength = 5,
    columnDefs = list(list(width = '200px', targets = c(11, 12)))
  )
)

dir_versions <- c("10.6.3", "10.7", "10.8", "10.9", "11.0", "11.0.1")
# Load network_tf_gene.tsv from every release listed in dir_versions and
# harmonise the columns whose format changed between releases.
#
# Outputs (all read by later chunks):
#   tfgnw_sets         named list, one table per release
#   tfgnw_versions     release tags in load order
#   tfgnw_set_<tag>    one global per release
tfgnw_sets <- list()
for (v in dir_versions) {
  version_tag <- paste0("v", v)
  set <- read.delim(
    file.path(dir_releases, v, "network_tf_gene.tsv"),
    comment.char = "#", header = TRUE, stringsAsFactors = FALSE,
    na.strings = c("", "NA")
  ) %>%
    dplyr::as_tibble() %>%
    dplyr::mutate(
      # Map the word form of the effect to its symbol; any other value
      # (including NA) is kept unchanged.
      effect = as.character(effect),
      effect = dplyr::case_when(
        effect == "repressor" ~ "-",
        effect == "activator" ~ "+",
        effect == "unknown" ~ "?",
        TRUE ~ effect
      ),
      version = version_tag
    )
  # Newer releases carry a second evidence column; merge it into `evidence`.
  # concat_uniq2() is applied row by row because it is defined elsewhere
  # and may not be vectorised.
  if ("evidence_function" %in% colnames(set)) {
    set <- set %>%
      dplyr::rowwise() %>%
      dplyr::mutate(evidence = concat_uniq2(evidence, evidence_function)) %>%
      dplyr::ungroup() # do not leak row-wise grouping into later chunks
  }
  set <- dplyr::mutate(set, pairs = paste0(TF_name, "_", gene_name))
  assign(paste0("tfgnw_set_", version_tag), set)
  tfgnw_sets[[version_tag]] <- set
}
tfgnw_versions <- names(tfgnw_sets)

# Stack all releases and fix the level order of the comparison variables.
all_tfgnw <- bind_rows(tfgnw_sets) %>%
  dplyr::mutate(
    version = factor(version, levels = tfgnw_versions),
    effect = factor(effect, levels = c("+", "-", "?")),
    confidence = factor(tolower(confidence), levels = c("weak", "strong", "confirmed"))
  )

# One row per (entry, evidence item). Items containing "|" are split into
# code / level / name; items without it are kept whole as the code, with
# level and name left NA. Missing trailing pieces become NA and pieces
# beyond the third are dropped. The original `evidence` text is kept.
all_tfgnw_by_evidence <- all_tfgnw %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  dplyr::mutate(evidence = trimws(gsub("\\[|\\]", "", evidence))) %>%
  tidyr::separate(
    evidence,
    into = c("evidence_code", "evidence_level", "evidence_name"),
    sep = "\\|", remove = FALSE, fill = "right", extra = "drop"
  )
NB: additional TAB characters at the end of each line cause parsing issues
Comparison of the network_tf_gene.txt file between the last releases.
| Release version | Date |
|---|---|
| 10.6 (NA) | 2019 July |
| 10.6.3 | - |
| 10.7 | 2020 April |
| 10.8 | 2020 October |
| 10.9 | 2021 April |
| 10.10 (NA) | 2022 February |
| 11.0 | 2022 August |
| 11.0.1 (not public) | 2022 September |
# Number of network_tf_gene entries per release.
tfgnw_summary <- all_tfgnw %>%
  dplyr::group_by(version) %>%
  dplyr::summarise(total = n()) %>%
  dplyr::arrange(version)

# Bar plot of the totals; simple_bar() is a helper defined outside this chunk.
tfgnw_num <- simple_bar(tfgnw_summary, "version", "total") +
  scale_fill_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(x = "Release version", y = "Number of TF-gene entries", title = "")
tfgnw_num

DT::datatable(tfgnw_summary, rownames = FALSE, options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10))

pairs <- list()
# Unique TF-gene pairs ("TF_gene") per release, compared with an UpSet plot.
pairs <- lapply(tfgnw_sets[tfgnw_versions], function(set) unique(set$pairs))
UpSetR::upset(
  UpSetR::fromList(pairs),
  sets = tfgnw_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6)
)
# Highlight queries currently disabled:
# queries = list(
#   list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = T),
#   list(query = intersects, params = list("v10.7", "v10.8"), color = "red", active = T)
# )

# TF-gene pair duplication: for each release, count how often every pair
# occurs, then how many pairs share each occurrence count. Grouping is
# dropped before factor() so the levels are in numeric order.
# (Named pairs_dupli; it used to reuse the coordinates variable name.)
pairs_dupli <- all_tfgnw %>%
  dplyr::group_by(version, pairs) %>%
  dplyr::summarise(occurrences = n(), .groups = "drop") %>%
  dplyr::group_by(version, occurrences) %>%
  dplyr::summarise(pairs_number = n(), .groups = "drop") %>%
  dplyr::mutate(occurrences = factor(occurrences))

# Text sizes shared by both panels.
axis_text_14 <- theme(
  axis.text = element_text(size = 14),
  axis.title.y = element_text(size = 14),
  title = element_text(size = 14)
)

dodge <- ggplot(pairs_dupli, aes(fill = occurrences, y = pairs_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  labs(x = "", y = "Number of unique TF-gene pairs", title = "TF-gene pairs duplication in network_tf_gene.txt across versions")

# Same plot restricted to pairs present 5 or more times.
dodge2 <- ggplot(
  pairs_dupli %>% dplyr::filter(!occurrences %in% c("1", "2", "3", "4")),
  aes(fill = occurrences, y = pairs_number, x = version)
) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of unique TF-gene pairs", title = "...minimum 5 copies")

dodge / dodge2
## pairs that have at least 10 entries in a given version
# TF-gene pairs with at least 10 entries in some release, one column per
# release. The grouping is dropped before reshaping so pivot_wider() does
# not run on a table still grouped by the column it spreads.
pairs_dupli_max <- all_tfgnw %>%
  dplyr::group_by(version, pairs) %>%
  dplyr::summarise(occurrences = n(), .groups = "drop") %>%
  dplyr::filter(occurrences >= 10) %>%
  dplyr::arrange(dplyr::desc(occurrences)) %>%
  tidyr::pivot_wider(names_from = version, values_from = occurrences)

DT::datatable(pairs_dupli_max, rownames = FALSE, options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10))

tf_names <- list()
# Unique TF names per release of the TF-gene network, compared with an
# UpSet plot. The list is rebuilt from scratch so no entries from the
# BindingSiteSet comparison remain in it.
tf_names <- lapply(tfgnw_sets[tfgnw_versions], function(set) unique(set$TF_name))
UpSetR::upset(
  UpSetR::fromList(tf_names),
  sets = tfgnw_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique TF names"
)
# Highlight query currently disabled:
# queries = list(list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = T))

gene_names <- list()
# Unique gene names per release of the TF-gene network, compared with an
# UpSet plot. The axis label says "gene names"; it previously repeated the
# TF-name label from the plot above.
gene_names <- lapply(tfgnw_sets[tfgnw_versions], function(set) unique(set$gene_name))
UpSetR::upset(
  UpSetR::fromList(gene_names),
  sets = tfgnw_versions, order.by = "freq", keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique gene names"
)
# Highlight query currently disabled:
# queries = list(list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = T))

# Text sizes shared by every panel below.
axis_text_14 <- theme(
  axis.text = element_text(size = 14),
  axis.title.y = element_text(size = 14),
  title = element_text(size = 14)
)

# ---- Effect ---------------------------------------------------------------
# Number of TF-gene entries per release and effect.
tfgnw_effect_long <- all_tfgnw %>%
  dplyr::group_by(version, effect) %>%
  dplyr::summarise(value = n(), .groups = "drop")

dodge <- ggplot(tfgnw_effect_long, aes(fill = effect, y = value, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "right") +
  labs(x = "", y = "Number of tfgnw", title = "")

stack <- ggplot(tfgnw_effect_long, aes(fill = effect, y = value, x = version)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of tfgnw", title = "")

line <- ggplot(tfgnw_effect_long, aes(group = effect, y = value, x = version)) +
  geom_line(aes(color = effect)) +
  geom_point(size = 2, aes(color = effect)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "right") +
  labs(x = "Release version", y = "Number of tfgnw", title = "")

## ggplotly to make it interactive
# fig0 <- ggplotly(tfgnw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)
subplot(fig1, fig2, fig3, nrows = 3)

# Same counts as a table: one row per effect, one column per release.
effect_summary <- all_tfgnw %>%
  dplyr::group_by(version, effect) %>%
  dplyr::summarise(value = n(), .groups = "drop") %>%
  data.frame() %>%
  tidyr::pivot_wider(names_from = version, values_from = value) %>%
  dplyr::arrange(effect)

DT::datatable(effect_summary, rownames = FALSE, options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10))

# ---- Evidence -------------------------------------------------------------
# Number of evidence items per release and evidence code.
# Note: only release 11.0.1 has evidence names.
tfgnw_evidence_long <- all_tfgnw_by_evidence %>%
  dplyr::group_by(version, evidence_code) %>%
  dplyr::summarise(value = n(), evidence_name = concat_uniq(evidence_name), .groups = "drop")

# One colour per distinct evidence code; random_palette() is defined
# outside this chunk.
evidence_palette <- random_palette(length(unique(all_tfgnw_by_evidence$evidence_code)))

dodge <- ggplot(tfgnw_evidence_long, aes(fill = evidence_code, y = value, x = version, group = evidence_code)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "right") +
  labs(x = "", y = "Number of tfgnws", title = "")

stack <- ggplot(tfgnw_evidence_long, aes(fill = evidence_code, y = value, x = version, group = evidence_code)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of evidence", title = "")

line <- ggplot(tfgnw_evidence_long, aes(group = evidence_code, y = value, x = version)) +
  geom_line(aes(color = evidence_code)) +
  scale_color_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  geom_point(size = 2, aes(color = evidence_code)) +
  theme_minimal() +
  axis_text_14 +
  theme(legend.position = "none") +
  labs(x = "Release version", y = "Number of tfgnw", title = "")

# tfgnw_num is the per-release total bar plot built in an earlier chunk.
fig0 <- ggplotly(tfgnw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)
subplot(fig0, fig1, fig2, fig3, nrows = 4)

# Per evidence code: number of items, names seen, and releases it appears in.
# Note: only release 11.0.1 has evidence names.
tfgnw_evidence_table <- all_tfgnw_by_evidence %>%
  dplyr::group_by(evidence_code) %>%
  dplyr::summarise(
    value = n(),
    evidence_name = concat_uniq(evidence_name),
    version = concat_uniq(version),
    .groups = "drop"
  )

DT::datatable(tfgnw_evidence_table, rownames = FALSE, options = list(searching = TRUE, lengthChange = FALSE, pageLength = 10))
## TODO: remove empty string from evidence table

tfgnw_evidence_shared <- list()
for (v in tfgnw_versions) {
tfgnw_evidence_shared[[v]] <- unique((all_tfgnw_by_evidence %>% dplyr::filter(version == v))$evidence_code)
}
UpSetR::upset(fromList(tfgnw_evidence_shared), sets = tfgnw_versions, order.by = "freq", keep.order = T,
text.scale = c(2,2,2,2,2,2),
# queries = list(list(query = intersects, params = list("v10.7"), color = "red", active = T),
# list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = T))
)## Revise congruency between table and upset plot (table has CE listed as only present in 10.8, shjould have 10.7 too)tfgnw_confidence_long <- all_tfgnw %>%
group_by(version, confidence) %>%
summarise(value = n())
## Confidence counts per release, bars side by side (legend on the right).
dodge <- ggplot(
  tfgnw_confidence_long,
  aes(fill = confidence, y = value, x = version)
) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of tfgnw", title = "")

## Same counts as stacked bars (legend hidden).
stack <- ggplot(
  tfgnw_confidence_long,
  aes(fill = confidence, y = value, x = version)
) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of tfgnw", title = "")

## Same counts as one line per confidence level across releases.
line <- ggplot(
  tfgnw_confidence_long,
  aes(group = confidence, y = value, x = version)
) +
  geom_line(aes(color = confidence)) +
  geom_point(size = 2, aes(color = confidence)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "Release version", y = "Number of tfgnw", title = "")
## ggplotly to make it interactive
# fig0 <- ggplotly(tfgnw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)
subplot(fig1, fig2, fig3, nrows = 3)

## Wide summary table: one row per confidence level, one column per release.
## NA confidence is shown as "null", missing combinations are filled with 0,
## and a "total" row with the column sums is appended at the end.
confidence_summary <- all_tfgnw %>%
  group_by(version, confidence) %>%
  summarise(value = n()) %>%
  mutate(confidence = ifelse(is.na(confidence), "null", as.character(confidence))) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = c(value)) %>%
  mutate(across(starts_with("v"), ~ replace_na(., 0))) %>%
  bind_rows(summarise(
    .,
    across(where(is.numeric), sum),
    across(where(is.character), ~"total")
  )) %>%
  mutate(confidence = factor(confidence, levels = c("weak", "strong", "confirmed", "null", "total"))) %>%
  arrange(confidence)
DT::datatable(
  confidence_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

## Release directories iterated by the loader loop that follows.
dir_versions <- c("10.6.3", "10.7", "10.8", "10.9", "11.0", "11.0.1")
## Read network_tf_tu.tsv for every release in dir_versions and harmonise it:
## effect words become the symbols + / - / ?, the release tag is stored in
## `version`, the evidence columns are merged when `evidence_function` exists,
## and a TF_TU key is stored in `pairs`. Each table is kept both in the
## `tftunw_sets` list and as a standalone `tftunw_set_<tag>` object.
tftunw_sets <- list()
tftunw_versions <- c()
for (v in dir_versions) {
  version_tag <- paste0("v", v)
  set <- read.delim(
    paste0(dir_releases, "/", v, "/network_tf_tu.tsv"),
    comment.char = "#",
    header = TRUE,
    stringsAsFactors = FALSE,
    na.strings = c("", "NA")
  ) %>%
    dplyr::mutate(
      effect = ifelse(
        effect == "repressor", "-",
        ifelse(
          effect == "activator", "+",
          ifelse(effect == "unknown", "?", effect)
        )
      ),
      version = version_tag
    ) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      evidence = ifelse(
        "evidence_function" %in% colnames(.),
        concat_uniq2(evidence, evidence_function),
        evidence
      ),
      pairs = paste0(TF_name, "_", TU_name)
    )
  assign(paste0("tftunw_set_", version_tag), set)
  tftunw_sets[[version_tag]] <- set
  tftunw_versions <- c(tftunw_versions, version_tag)
}
## Stack all releases into one table and turn version, effect and
## (lower-cased) confidence into factors with a fixed level order.
all_tftunw <- bind_rows(tftunw_sets) %>%
  dplyr::mutate(
    version = factor(version, levels = tftunw_versions),
    effect = factor(effect, levels = c("+", "-", "?")),
    confidence = factor(
      tolower(confidence),
      levels = c("weak", "strong", "confirmed")
    )
  )
## One row per evidence entry: split the comma-separated evidence column,
## strip square brackets and surrounding whitespace, then break entries of the
## form "code|level|name" into three columns. Entries without a "|" are kept
## whole as the code, with level and name set to NA.
all_tftunw_by_evidence <- all_tftunw %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  dplyr::mutate(evidence = trimws(gsub("\\[|\\]", "", evidence))) %>%
  ## placeholder columns, overwritten row by row below
  dplyr::mutate(
    evidence_code = "",
    evidence_level = "",
    evidence_name = ""
  ) %>%
  dplyr::rowwise() %>%
  ## NA_character_ (not bare NA) keeps these columns character even when no
  ## entry contains a "|"
  dplyr::mutate(
    evidence_code = ifelse(grepl("\\|", evidence), stringr::str_split(evidence, "\\|")[[1]][1], evidence),
    evidence_level = ifelse(grepl("\\|", evidence), stringr::str_split(evidence, "\\|")[[1]][2], NA_character_),
    evidence_name = ifelse(grepl("\\|", evidence), stringr::str_split(evidence, "\\|")[[1]][3], NA_character_)
  )
NB: additional TAB characters at the end of each line cause parsing issues
Comparison of the network_tf_tu.txt file between the last releases.
| Release version | Date |
|---|---|
| 10.6 (NA) | 2019 July |
| 10.6.3 | - |
| 10.7 | 2020 April |
| 10.8 | 2020 October |
| 10.9 | 2021 April |
| 10.10 (NA) | 2022 February |
| 11.0 | 2022 August |
| 11.0.1 (not public) | 2022 September |
## Number of TF-TU rows per release.
tftunw_summary <- all_tftunw %>%
  dplyr::group_by(version) %>%
  dplyr::summarise(total = n()) %>%
  dplyr::arrange(version)

## Bar plot of the totals. The y label now names TF-TU entries: the data come
## from network_tf_tu.tsv, and "TF-gene" was left over from the TF-gene section.
tftunw_num <- simple_bar(tftunw_summary, "version", "total") +
  scale_fill_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(x = "Release version", y = "Number of TF-TU entries", title = "")
## Show the bar plot and the per-release totals.
tftunw_num
DT::datatable(
  tftunw_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

## Unique TF_TU keys per release (input for the UpSet plot). The per-release
## tables are read from `tftunw_sets`, which holds the same objects as the
## `tftunw_set_<tag>` variables, so no get()/paste0() lookup is needed.
pairs <- list()
for (v in tftunw_versions) {
  pairs[[v]] <- unique(tftunw_sets[[v]]$pairs)
}
## UpSet plot of the TF_TU keys shared between releases.
UpSetR::upset(
  fromList(pairs),
  sets = tftunw_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2)
  # queries = list(
  # list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = T),
  # list(query = intersects, params = list("v10.7", "v10.8"), color = "red", active = T)
  # )
)

## For each release, how many TF_TU keys occur once, twice, three times, ...
## The grouping is dropped before building the factor so that its levels are
## computed over the whole table, not separately per release.
coords_dupli <- all_tftunw %>%
  dplyr::group_by(version, pairs) %>%
  dplyr::summarise(occurrences = n()) %>%
  dplyr::group_by(version, occurrences) %>%
  dplyr::summarise(pairs_number = n(), .groups = "drop") %>%
  dplyr::mutate(occurrences = factor(occurrences))
###
## Bars of the duplication counts per release. Labels and title now name
## TF-TU pairs and network_tf_tu.txt: they were copied from the TF-gene
## section, but this table is built from network_tf_tu.tsv.
dodge <- ggplot(coords_dupli, aes(fill = occurrences, y = pairs_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(x = "", y = "Number of unique TF-TU pairs", title = "TF-TU pairs duplication in network_tf_tu.txt across versions")
dodge

## Unique TF names per release (input for the UpSet plot), read from the
## `tftunw_sets` list instead of get()/paste0() lookups.
tf_names <- list()
for (v in tftunw_versions) {
  tf_names[[v]] <- unique(tftunw_sets[[v]]$TF_name)
}
## UpSet plot of the TF names shared between releases.
UpSetR::upset(
  fromList(tf_names),
  sets = tftunw_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique TF names"
  # queries = list(list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = T))
)

## Unique TU names per release (input for the UpSet plot), read from the
## `tftunw_sets` list instead of get()/paste0() lookups.
TU_names <- list()
for (v in tftunw_versions) {
  TU_names[[v]] <- unique(tftunw_sets[[v]]$TU_name)
}
## UpSet plot of the TU names shared between releases.
UpSetR::upset(
  fromList(TU_names),
  sets = tftunw_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique TU names"
  # queries = list(list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = T))
)

## Number of rows per release version and effect (long format).
tftunw_effect_long <- all_tftunw %>%
  group_by(version, effect) %>%
  summarise(value = n())
## Effect counts per release, bars side by side (legend on the right).
dodge <- ggplot(
  tftunw_effect_long,
  aes(fill = effect, y = value, x = version)
) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of tftunw", title = "")

## Same counts as stacked bars (legend hidden).
stack <- ggplot(
  tftunw_effect_long,
  aes(fill = effect, y = value, x = version)
) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of tftunw", title = "")

## Same counts as one line per effect across releases.
line <- ggplot(
  tftunw_effect_long,
  aes(group = effect, y = value, x = version)
) +
  geom_line(aes(color = effect)) +
  geom_point(size = 2, aes(color = effect)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "Release version", y = "Number of tftunw", title = "")
## ggplotly to make it interactive
# fig0 <- ggplotly(tftunw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)
subplot(fig1, fig2, fig3, nrows = 3)

## Wide summary table: one row per effect, one column per release.
effect_summary <- all_tftunw %>%
  group_by(version, effect) %>%
  summarise(value = n()) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = c(value)) %>%
  arrange(effect)
DT::datatable(
  effect_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

## Number of rows per release version and evidence code (long format), with
## the evidence names collapsed by concat_uniq().
## Note: only v11.0.1 has evidence names.
tftunw_evidence_long <- all_tftunw_by_evidence %>%
  group_by(version, evidence_code) %>%
  summarise(value = n(), evidence_name = concat_uniq(evidence_name))
## One colour per distinct evidence code, shared by the three plots below.
evidence_palette <- random_palette(
  length(unique(all_tftunw_by_evidence$evidence_code))
)
##----
## Evidence counts per release, bars side by side (legend on the right).
dodge <- ggplot(
  tftunw_evidence_long,
  aes(fill = evidence_code, y = value, x = version, group = evidence_code)
) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  # scale_fill_viridis(discrete = T, na.value = "gray", drop = F) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of tftunws", title = "")

## Same counts as stacked bars (legend hidden).
stack <- ggplot(
  tftunw_evidence_long,
  aes(fill = evidence_code, y = value, x = version, group = evidence_code)
) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  # scale_fill_viridis(discrete = T, na.value = "gray", drop = F) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of evidence", title = "")

## Same counts as one line per evidence code across releases (legend hidden).
line <- ggplot(
  tftunw_evidence_long,
  aes(group = evidence_code, y = value, x = version)
) +
  geom_line(aes(color = evidence_code)) +
  scale_color_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  geom_point(size = 2, aes(color = evidence_code)) +
  # scale_fill_viridis(discrete = T, na.value = "gray", drop = F) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "Release version", y = "Number of tftunw", title = "")
## Interactive versions of the total and evidence plots, stacked in a single
## column.
fig0 <- ggplotly(tftunw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)
subplot(fig0, fig1, fig2, fig3, nrows = 4)

## One row per evidence code: number of rows carrying it, plus the evidence
## names and the release versions collapsed with concat_uniq().
## Note: only v11.0.1 has evidence names.
tftunw_evidence_table <- all_tftunw_by_evidence %>%
  group_by(evidence_code) %>%
  summarise(
    value = n(),
    evidence_name = concat_uniq(evidence_name),
    version = concat_uniq(version)
  )
DT::datatable(
  tftunw_evidence_table,
  rownames = FALSE,
  options = list(searching = TRUE, lengthChange = FALSE, pageLength = 10)
)
## TODO: remove empty string from evidence table

## Evidence codes present in each release (input for the UpSet plot).
## The list must be initialised on its own line: when it was fused to the
## comment above, the loop assigned into an object that did not exist.
tftunw_evidence_shared <- list()
for (v in tftunw_versions) {
  tftunw_evidence_shared[[v]] <- unique((all_tftunw_by_evidence %>% dplyr::filter(version == v))$evidence_code)
}
## UpSet plot of the evidence codes shared between releases.
UpSetR::upset(
  fromList(tftunw_evidence_shared),
  sets = tftunw_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2)
  # queries = list(list(query = intersects, params = list("v10.7"), color = "red", active = T),
  # list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = T))
)
## TODO: revise congruency between table and upset plot (table has CE listed
## as only present in 10.8, should have 10.7 too)

## Number of rows per release version and confidence level (long format).
tftunw_confidence_long <- all_tftunw %>%
  group_by(version, confidence) %>%
  summarise(value = n())
## Confidence counts per release, bars side by side (legend on the right).
dodge <- ggplot(
  tftunw_confidence_long,
  aes(fill = confidence, y = value, x = version)
) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of tftunw", title = "")

## Same counts as stacked bars (legend hidden).
stack <- ggplot(
  tftunw_confidence_long,
  aes(fill = confidence, y = value, x = version)
) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of tftunw", title = "")

## Same counts as one line per confidence level across releases.
line <- ggplot(
  tftunw_confidence_long,
  aes(group = confidence, y = value, x = version)
) +
  geom_line(aes(color = confidence)) +
  geom_point(size = 2, aes(color = confidence)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "Release version", y = "Number of tftunw", title = "")
## ggplotly to make it interactive
# fig0 <- ggplotly(tftunw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)
subplot(fig1, fig2, fig3, nrows = 3)

## Wide summary table: one row per confidence level, one column per release.
## NA confidence is shown as "null", missing combinations are filled with 0,
## and a "total" row with the column sums is appended at the end.
confidence_summary <- all_tftunw %>%
  group_by(version, confidence) %>%
  summarise(value = n()) %>%
  mutate(confidence = ifelse(is.na(confidence), "null", as.character(confidence))) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = c(value)) %>%
  mutate(across(starts_with("v"), ~ replace_na(., 0))) %>%
  bind_rows(summarise(
    .,
    across(where(is.numeric), sum),
    across(where(is.character), ~"total")
  )) %>%
  mutate(confidence = factor(confidence, levels = c("weak", "strong", "confirmed", "null", "total"))) %>%
  arrange(confidence)
DT::datatable(
  confidence_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)
-->
## Save the whole workspace to Binding_dataset_report.Rdata.
save.image(file = "Binding_dataset_report.Rdata")